# Replication materials for
# Kayser, Mark A., Arndt Leininer Anastasiia Vlasenko (2021): "A Länder-based
# Forecast of the 2021 German Bundestag Election" published in the symposium
# "Forecasting 2021 German elections" in PS: Political Science & Politics

setwd('set your working directory')

library(haven)
data <- read_dta("replication_data.dta")

# Table 1 ----------------------------------------------------------------------

#Unweighted
library(lme4)
summary(m_v <- lmer(voteshare ~ (1 + voteshare_state | state) +
                      (1 | party:state) +
                      voteshare_l + voteshare_state + pm_party + gpya_l +
                      pm_party_xgpya_l + pmyears + pmyears_xpm_party, data,
                    subset = state != 'Bundesgebiet', REML = F)
)

#Reduced unweighted

summary(m_v_nostate <- lmer(voteshare ~ (1 | party:state) +
                              voteshare_l + pm_party + gpya_l +
                              pm_party_xgpya_l + pmyears + pmyears_xpm_party, data,
                            subset = state != 'Bundesgebiet', REML = F)
)

#Weighted
summary(m_w <- lmer(voteshare ~ (1 + voteshare_state | state) +
                      (1 | party:state) +
                      voteshare_l + voteshare_state + pm_party + gpya_l +
                      pm_party_xgpya_l + pmyears + pmyears_xpm_party,
                    weights = weight, data,
                    subset = state != 'Bundesgebiet', REML = F)
)

#calculating R-squared
library(MuMIn)
r.squaredGLMM(m_v)
r.squaredGLMM(m_v_nostate)
r.squaredGLMM(m_w)

# Table 1
library(texreg)
screenreg(list(m_v, m_v_nostate, m_w))

# Table 2 ----------------------------------------------------------------------

# simulate the turnout as well as the forecasting model to generate valid
# measures of uncertainty around prediction

library(foreign)
library(dplyr)
library(nlme)

nsim <- 500

# Turnout ----------------------------------------------------------------------

t <- read_dta("turnout_data.dta")

# model estimated on the original data
m_t <- lmer(turnout ~ (1 + elec_year | state_id) + l_turnout + elec_year +
              state_id, t)

summary(m_t)

# simulation
set.seed(1909)
m_t_sim <- arm::sim(m_t, nsim)

X <- t %>% mutate(intercept = 1) %>%
  select(intercept, l_turnout, elec_year, state_id) %>%
  mutate(state_id = as.character(state_id)) %>%
  filter(elec_year == 2021)

mm <- model.matrix(~ state_id, data = X)

X <- cbind(X, mm[, -1])

predictions_t <- data.frame()

# cycle through countries to include country-specific intercepts & effects

for(s in unique(X$state_id)) {

  #transposition and multiplication
  temp <- t(as.matrix(X[which(X$state_id == s), c(1:3, 5:19)]))
  temp_rot <- t(temp)
  temp_rot <- subset(temp_rot, select = -c(state_id12, state_id13, state_id14, state_id15, state_id5, state_id7, state_id8, state_id9, state_id11))
  temp_rot <- subset(temp_rot, select = -c(state_id16, state_id17, state_id2, state_id3, state_id4))
  temp_rot_2 <- t(temp_rot)

  coef <- fixef(m_t_sim)

  coef[, 1] <- coef[, 1] + ranef(m_t_sim)$state_id[, s, 1]
  coef[, 3] <- coef[, 3] + ranef(m_t_sim)$state_id[, s, 2]

  tmp <- (coef  %*% temp_rot_2+
            rnorm(nsim, mean = 0, sd = attr(VarCorr(m_t), "sc"))) %>%
    data.frame(1:nsim, s, .)
  names(tmp) <- c('simid_t', 'state', 'turnout_pred')
  predictions_t <- bind_rows(predictions_t, tmp)
}
predictions_t <- tbl_df(predictions_t)


predictions_t %>% group_by(state) %>% summarise(mean = mean(turnout_pred),
                                                lwr = quantile(turnout_pred, 0.025),
                                                upr = quantile(turnout_pred, .975))

tmp <-
  data %>% filter(elec_year == 2021, state != 'Bundesgebiet', party == 'spd') %>%
  select(state, electorate)

tmp$state_id <- 2:17
tmp$state_id[tmp$state == "Baden-Württemberg"] <- 1
tmp$state_id[tmp$state == "Bayern"] <- 2
tmp$state_id[tmp$state == "Berlin"] <- 3
tmp$state_id[tmp$state == "Brandenburg"] <- 4
tmp$state_id[tmp$state == "Bremen"] <- 5

# add electorate numbers
predictions_t <- left_join(predictions_t, tmp, by = 'state')
predictions_t <- merge(predictions_t, tmp, by.x = c("state"), by.y = c("state_id"), all.x = TRUE)
predictions_t$electorate <- predictions_t$electorate.y

# calculate the number of valid votes for 2017
predictions_t$valid <- predictions_t$turnout_pred * predictions_t$electorate

# Unweighted model
# Vote shares (state level) ----------------------------------------------------

# Model estimated on the original data -
summary(m_v <- lmer(voteshare ~ (1 + voteshare_state | state) +
                      (1 | party:state) +
                      voteshare_l + voteshare_state + pm_party + gpya_l +
                      pm_party_xgpya_l + pmyears + pmyears_xpm_party, data,
                    subset = state != 'Bundesgebiet', REML = F)
)

# simulation
set.seed(1909)
m_v_sim <- arm::sim(m_v, nsim)

X <- data %>% mutate(intercept = 1) %>%
  select(state, party, elec_year, intercept, voteshare_l, voteshare_state,
         pm_party, gpya_l, pm_party_xgpya_l, pmyears, pmyears_xpm_party) %>%
  filter(elec_year == 2021, state != 'Bundesgebiet')

predictions_v <- data.frame()
# cycle through countries to include country-specific intercepts & effects
for(i in 1:nrow(X)) {
  s <- as.character(X[i, 'state'])
  p <- as.character(X[i, 'party'])

  coef <- fixef(m_v_sim)
  coef[, 1] <- coef[, 1] +
    # ranef(m_v_sim)$`party:state`[, paste(p, s, sep = ':'), 1] +
    ranef(m_v_sim)$state[, s, 1]
  coef[, 3] <- coef[, 3] +
    # ranef(m_v_sim)$`party:state`[, paste(p, s, sep = ':'), 2] +
    ranef(m_v_sim)$state[, s, 2]

  tmp <- (coef %*% t(as.matrix(X[i, -(1:3)])) +
            rnorm(nsim, mean = 0, sd = attr(VarCorr(m_v), "sc"))) %>%
    data.frame(1:nsim, s, p, .)
  names(tmp) <- c('simid_v', 'state', 'party', 'voteshare_pred')
  predictions_v <- bind_rows(predictions_v, tmp)
}
predictions_v <- tbl_df(predictions_v)

predictions_v %>% group_by(state, party) %>%
  summarise(mean = mean(voteshare_pred),
            lwr = quantile(voteshare_pred, .025),
            upr = quantile(voteshare_pred, .975))


# Unweighted model
# Vote shares (federal level) --------------------------------------------------

# merge predictions of turnout and predictions of party vote shares
predictions <- merge(predictions_v, predictions_t, by.x = c("simid_v", "state"), by.y = c("simid_t","state.y"), all.x = TRUE)
# calculate the number of votes received by a party
predictions$votes_hat = predictions$voteshare_pred/100 * predictions$valid
#replace votes_hat_w = voteshare_hat_w/100 * valid if state != "Bundesgebiet"  // weighted model

predictions$simid <-
  paste(predictions$simid_t, predictions$simid_v, sep = '-')

predictions$simid <- predictions$simid_v

# Aggregate to the national level
simdata <- predictions %>% group_by(simid, party) %>%
  summarise(votes_hat_party = sum(votes_hat)) %>%
  group_by(simid) %>%
  mutate(votes_hat_national = sum(votes_hat_party),
         voteshare_hat = votes_hat_party / votes_hat_national * 100)

confint <- simdata %>% group_by(party) %>%
  summarise(mean = round(mean(voteshare_hat), 1),
            lwr = round(quantile(voteshare_hat, .025), 1),
            upr = round(quantile(voteshare_hat, .975), 1))

# Weighted model
# Vote shares (state level) ----------------------------------------------------

# Model estimated on the original data -
summary(m_w <- lmer(voteshare ~ (1 + voteshare_state | state) +
                      (1 | party:state) +
                      voteshare_l + voteshare_state + pm_party + gpya_l +
                      pm_party_xgpya_l + pmyears + pmyears_xpm_party,
                    weights = weight, data,
                    subset = state != 'Bundesgebiet', REML = F)
)

# simulation
set.seed(1909)
m_w_sim <- arm::sim(m_w, nsim)

X_w <- cbind(X, mm[, -1])

X_w <- data %>% mutate(intercept = 1) %>%
  select(state, party, elec_year, intercept, voteshare_l, voteshare_state,
         pm_party, gpya_l, pm_party_xgpya_l, pmyears, pmyears_xpm_party) %>%
  filter(elec_year == 2021, state != 'Bundesgebiet')

predictions_w <- data.frame()
# cycle through countries to include country-specific intercepts & effects
for(i in 1:nrow(X_w)) {
  s <- as.character(X_w[i, 'state'])
  p <- as.character(X_w[i, 'party'])

  coef <- fixef(m_w_sim)
  coef[, 1] <- coef[, 1] +
    # ranef(m_w_sim)$`party:state`[, paste(p, s, sep = ':'), 1] +
    ranef(m_w_sim)$state[, s, 1]
  coef[, 3] <- coef[, 3] +
    # ranef(m_w_sim)$`party:state`[, paste(p, s, sep = ':'), 2] +
    ranef(m_w_sim)$state[, s, 2]

  tmp <- (coef %*% t(as.matrix(X_w[i, -(1:3)])) +
            rnorm(nsim, mean = 0, sd = attr(VarCorr(m_w), "sc"))) %>%
    data.frame(1:nsim, s, p, .)
  names(tmp) <- c('simid_w', 'state', 'party', 'voteshare_pred')
  predictions_w <- bind_rows(predictions_w, tmp)
}
predictions_w <- tbl_df(predictions_w)

predictions_w %>% group_by(state, party) %>%
  summarise(mean = mean(voteshare_pred),
            lwr = quantile(voteshare_pred, .025),
            upr = quantile(voteshare_pred, .975))


# Weighted model
# Vote shares (federal level) --------------------------------------------------

# merge predictions of turnout and predictions of party vote shares
predictions_weight <- merge(predictions_w, predictions_t, by.x = c("simid_w", "state"), by.y = c("simid_t","state.y"), all.x = TRUE)
# calculate the number of votes received by a party
predictions_weight$votes_hat = predictions_weight$voteshare_pred/100 * predictions_weight$valid
#replace votes_hat_w = voteshare_hat_w/100 * valid if state != "Bundesgebiet"  // weighted model

predictions_weight$simid_w <-
  paste(predictions_weight$simid_t, predictions_weight$simid_w, sep = '-')

predictions_weight$simid <- predictions_weight$simid_w

# Aggregate to the national level
simdata_w <- predictions_weight %>% group_by(simid, party) %>%
  summarise(votes_hat_party = sum(votes_hat)) %>%
  group_by(simid) %>%
  mutate(votes_hat_national = sum(votes_hat_party),
         voteshare_hat = votes_hat_party / votes_hat_national * 100)

confint_w <- simdata_w %>% group_by(party) %>%
  summarise(mean = round(mean(voteshare_hat), 1),
            lwr = round(quantile(voteshare_hat, .025), 1),
            upr = round(quantile(voteshare_hat, .975), 1))

#Polling predictions

library(xtable)
library(dplyr)
library(htmltab)
library(stringr)
library(knitr)
library(kableExtra)

names(confint)[2] <- "mean_uw"
names(confint)[3] <- "lwr_uw"
names(confint)[4] <- "upr_uw"

names(confint_w)[2] <- "mean_w"
names(confint_w)[3] <- "lwr_w"
names(confint_w)[4] <- "upr_w"

pred <- merge(confint, confint_w, by.x = c("party"), by.y = c("party"), all.x = TRUE)

pred$party <- c('AfD', 'CDU/CSU', 'Die Linke', 'FDP', 'Bündnis 90/Die Grünen',
                'Others', 'SPD')
#Adding "[]"
library(tidyr)
pred <- pred %>%
  unite("lowup_un", lwr_uw:upr_uw, sep=",")

pred <- pred %>%
  unite("Forecast", mean_uw:lowup_un, sep="[")

pred$Forecast <- paste(pred$Forecast, "]", sep="")

pred <- pred %>%
  unite("lowup_w", lwr_w:upr_w, sep=",")

pred <- pred %>%
  unite("Forecast (weighted model)", mean_w:lowup_w, sep="[")

pred$`Forecast (weighted model)` <- paste(pred$`Forecast (weighted model)`, "]", sep="")


# add in most recent national polling data for 2021 ----------------------------
# the manuscript represents the averages of polls available on wahlrecht.de on
# 7 June 2021
# NB: running the code at a later point in time will obtain current polling
# for that time from the website below
url <- 'http://www.wahlrecht.de/umfragen/index.htm'

library(htmltab)
tab <- htmltab(url, which = 2)

tab <- tab %>% select(1:(ncol(tab) - 1)) %>% filter(grepl('%', INSA)) %>%
  mutate_at(vars(Allensbach:Yougov), str_replace, pattern = ' %', replacement = '') %>%
  mutate_at(vars(Allensbach:Yougov), str_replace, pattern = ',', replacement = '.') %>%
  mutate_at(vars(Allensbach:Yougov), as.numeric) %>%
  mutate(poll = round(rowMeans(.[2:ncol(.)], na.rm = T), 1)) %>%
  select(Institut, poll) %>% rename(party = Institut)

tab$party <- c('CDU/CSU', 'SPD', 'Bündnis 90/Die Grünen', 'FDP', 'Die Linke', 'AfD',
             'Others')

# merge in current polling ----------------------------------------
pred <- merge(pred, tab, by.x = c("party"), by.y = c("party"), all.x = TRUE)
names(pred)[1] <- 'Party'
names(pred)[4] <- 'Polling June 2021'

# add forecast as weighted average ----------------------------------------------

# create weights ---------------------------------------------------------------

library(tidyverse)
library(lubridate)
library(stringr)

elec_date <- ymd('2021-09-26')
start_date <- ymd('2021-06-22')

weights <- tibble(date = as_date(start_date:elec_date),
                  days = (elec_date - start_date):0,
                  lweight = .9 - .4 / as.numeric(elec_date - start_date) * days)

w <- weights %>%
  filter(days == as.numeric(ymd('2021-09-26') - ymd('2021-06-22')))

pred <-
  pred %>%
  mutate(`Hybrid Forecast` =
           as.numeric(str_extract(Forecast, '\\d+\\.\\d')) * (1 - w$lweight) +
           `Polling June 2021` * w$lweight,
         `Hybrid Forecast` = round(`Hybrid Forecast`, 1))

# Table 2
pred

# Figure 1 ---------------------------------------------------------------------

table <- pred

names(table) <- c('party', 'forecast', 'forecast_w', 'polling', 'forecast_hybrid')

table <-
    table %>%
    mutate(forecast = as.numeric(str_extract(forecast, '\\d+\\.\\d')),
           forecast_w = as.numeric(str_extract(forecast_w, '\\d+\\.\\d')))

df <- tibble()

for(i in 1:nrow(table)) {
    tmp <- bind_cols(weights, table[i,])
    df <- bind_rows(df, tmp)
}

df <-
    df %>% mutate(forecast_hybrid = forecast * (1 - lweight) + polling * lweight)

# plot averaged hybrid forecast over time

df$party <- factor(df$party,
                   levels = c('CDU/CSU', 'Bündnis 90/Die Grünen', 'SPD', 'AfD',
                              'FDP', 'Die Linke', 'Others'))

g <-
    df %>% filter(date > ymd('2021-06-22')) %>%
    ggplot(., aes(x = date, y = forecast_hybrid, color = party)) +
    geom_line() +
    scale_color_manual(name = 'Party',
                       values = c('AfD' = 'darkblue',
                                  'Bündnis 90/Die Grünen' = 'darkgreen',
                                  'CDU/CSU' = 'black',
                                  'Die Linke' = 'purple',
                                  'FDP' = 'goldenrod',
                                  'Others' = 'dimgrey',
                                  'SPD' = 'darkred')) +
    scale_x_date(name = '2021', expand = c(0, 0),
                 limits = c(ymd('2021-06-22', ymd('2021-09-26'))),
                 breaks = ymd(c('2021-06-22',
                                c('2021-07-01'),
                                c('2021-08-01'), c('2021-09-01'),
                                c('2021-09-26'))),
                 labels = c('22 June', 'July', 'August',
                            'September', '26 September')) +
    scale_y_continuous(name = 'Hybrid Forecast (%)', limits = c(0, 30)) +
    theme_bw(base_size = 14) + theme(panel.grid.minor = element_blank())

g

